NAMES: Madeline Abbott, Aidan Teppema, Daisy Cho
We began our exploratory analysis of the data by creating plots of potential variables of interest (such as hurricane frequency, category frequency, maximum wind speed, NAO index, and CO2 levels) over time. We looked for trends in these variables over time–trends that we could potentially use in our model…
======= <<<<<<< HEADWe began our exploratory analysis of the data by creating plots of potential variables of interest (such as hurricane frequency, category frequency, maximum wind speed, NAO index, and CO2 levels) over time. We looked for trends in these variables over time–trends that we could potentially use in our model. This visuals help exlain the variables and we plan to adapt many of these plots to include them in our final report. Maybe even in a shiny ap!
=======We began our exploratory analysis of the data by creating plots of potential variables of interest (such as hurricane frequency, category frequency, maximum wind speed, NAO index) over time. We looked for trends in these variables over time–trends that we could potentially use in our model…
>>>>>>> 2cf57e92ab75bb271078cf20c211637c4172c6fc >>>>>>> 48722bde6780321fa48cb715ace959574fced67ahurricane_data <- read_csv("Historical_Tropical_Storm_Tracks.csv")
## Parsed with column specification:
## cols(
## FID = col_integer(),
## YEAR = col_integer(),
## MONTH = col_integer(),
## DAY = col_integer(),
## AD_TIME = col_character(),
## BTID = col_integer(),
## NAME = col_character(),
## LAT = col_double(),
## LONG = col_double(),
## WIND_KTS = col_integer(),
## PRESSURE = col_integer(),
## CAT = col_character(),
## BASIN = col_character(),
## Shape_Leng = col_double()
## )
hurricane_data<-na.omit(hurricane_data)
hurricane_data<-subset(hurricane_data, select=-c(BTID, AD_TIME))
#transform(hurricane_data, char = as.numeric(LAT))
#transform(hurricane_data, char = as.numeric(LONG))
hurricane_data$LAT <- as.numeric(hurricane_data$LAT)
hurricane_data$LONG <- as.numeric(hurricane_data$LONG)
dim(hurricane_data)
## [1] 59228 12
names(hurricane_data)
## [1] "FID" "YEAR" "MONTH" "DAY" "NAME"
## [6] "LAT" "LONG" "WIND_KTS" "PRESSURE" "CAT"
## [11] "BASIN" "Shape_Leng"
head(hurricane_data)
## # A tibble: 6 x 12
## FID YEAR MONTH DAY NAME LAT LONG WIND_KTS PRESSURE CAT
## <int> <int> <int> <int> <chr> <dbl> <dbl> <int> <int> <chr>
## 1 2001 1957 8 8 NOTNAMED 22.5 -140.0 50 0 TS
## 2 2002 1961 10 3 PAULINE 22.1 -140.2 45 0 TS
## 3 2003 1962 8 29 C 18.0 -140.0 45 0 TS
## 4 2004 1967 7 14 DENISE 16.6 -139.5 45 0 TS
## 5 2005 1972 8 16 DIANA 18.5 -139.8 70 0 H1
## 6 2006 1976 7 22 DIANA 18.6 -139.8 30 0 TD
## # ... with 2 more variables: BASIN <chr>, Shape_Leng <dbl>
summary(hurricane_data)
## FID YEAR MONTH DAY
## Min. : 1 Min. :1851 Min. : 1.000 Min. : 1.00
## 1st Qu.:14808 1st Qu.:1928 1st Qu.: 8.000 1st Qu.: 8.00
## Median :29614 Median :1970 Median : 9.000 Median :16.00
## Mean :29614 Mean :1957 Mean : 8.541 Mean :15.87
## 3rd Qu.:44421 3rd Qu.:1991 3rd Qu.: 9.000 3rd Qu.:23.00
## Max. :59228 Max. :2008 Max. :12.000 Max. :31.00
## NAME LAT LONG WIND_KTS
## Length:59228 Min. : 4.20 Min. :-180.0 Min. : 10.00
## Class :character 1st Qu.:16.10 1st Qu.:-108.5 1st Qu.: 35.00
## Mode :character Median :21.20 Median : -81.2 Median : 50.00
## Mean :23.53 Mean : -83.2 Mean : 54.73
## 3rd Qu.:29.60 3rd Qu.: -62.2 3rd Qu.: 70.00
## Max. :69.00 Max. : 180.0 Max. :165.00
## PRESSURE CAT BASIN Shape_Leng
## Min. : 0.0 Length:59228 Length:59228 Min. : 0.0000
## 1st Qu.: 0.0 Class :character Class :character 1st Qu.: 0.7071
## Median : 0.0 Mode :character Mode :character Median : 1.0296
## Mean : 372.3 Mean : 1.2020
## 3rd Qu.: 990.0 3rd Qu.: 1.4318
## Max. :1024.0 Max. :11.1803
# select only the hurricanes after 1950 (this is when the naming system started)
hurricanes_recent <- hurricane_data %>%
filter(YEAR > 1949)
Some plots…
# Plotting hurricane denisty by location
get_density <- function(x, y, n = 100) {
dens <- MASS::kde2d(x = x, y = y, n = n)
ix <- findInterval(x, dens$x)
iy <- findInterval(y, dens$y)
ii <- cbind(ix, iy)
return(dens$z[ii])
}
hurricanes_recent$density <- get_density(hurricanes_recent$LONG, hurricanes_recent$LAT)
# creating a sample data.frame with your lat/lon points
hurricanes_recent$LAT <- as.numeric(hurricanes_recent$LAT)
hurricanes_recent$LONG <- as.numeric(hurricanes_recent$LONG)
#transform(hurricanes_recent, char = as.numeric(LAT))
#transform(hurricanes_recent, char = as.numeric(LONG))
# getting the map
map_hurricanes <- get_map(location = c(lon = mean(hurricanes_recent$LONG), lat = mean(hurricanes_recent$LAT)), zoom = 2, maptype = "terrain", scale = 2)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=21.893561,-89.486682&zoom=2&size=640x640&scale=2&maptype=terrain&language=en-EN
# plotting the map with some points on it
ggmap(map_hurricanes) +
geom_point(data = hurricanes_recent, aes(x = LONG, y = LAT, alpha = 0.1, color = density)) + scale_color_viridis() + guides(size=FALSE)
## Warning: Removed 421 rows containing missing values (geom_point).
# Total number of hurricanes per year
hurricanes_per_year <- group_by(hurricanes_recent, YEAR) %>%
summarize(length(unique(NAME)))
colnames(hurricanes_per_year)[2] <- "TOTAL_HURRICANES"
ggplot(hurricanes_per_year, aes(x = YEAR, y = TOTAL_HURRICANES)) + geom_point()
# Total number of hurricanes of each category per year
hurricanes_categories <- group_by(hurricanes_recent, YEAR) %>%
count(CAT)
colnames(hurricanes_categories)[3] <- "TOTAL_HURRICANES"
ggplot(hurricanes_categories, aes(x = YEAR, y = TOTAL_HURRICANES, color = CAT, fill = CAT)) + geom_bar(stat = "identity")
ggplot(hurricanes_categories, aes(x = YEAR, y = TOTAL_HURRICANES, color = CAT, fill = CAT)) + geom_bar(stat = "identity", position = position_dodge(width = 0.9))
# Average pressure by year
hurricane_pressure <- group_by(hurricanes_recent, YEAR) %>%
summarize(mean(PRESSURE))
colnames(hurricane_pressure)[2] <- 'AVG_PRESSURE'
colnames(hurricane_pressure)[1] <- "YEAR"
ggplot(hurricane_pressure, aes(x = YEAR, y = AVG_PRESSURE)) + geom_point() + ylab("Average pressure")
# Average max wind speed by year
hurricanes_wind <- group_by(hurricanes_recent, YEAR) %>%
summarize(mean(WIND_KTS))
colnames(hurricanes_wind)[2] <- "AVG_WIND"
ggplot(hurricanes_wind, aes(x = YEAR, y = AVG_WIND)) + geom_point() + ylab("Average wind speed (kts)")
# If you want to save a plot...
#ggsave("hurri_categories2.pdf", plot = last_plot(), height = 5, width = 10)
temp_data <- read.csv("ZonAnn.csv")
temp_data[!complete.cases(temp_data),]
## [1] Year Glob NHem SHem X24N.90N X24S.24N X90S.24S
## [8] X64N.90N X44N.64N X24N.44N EQU.24N X24S.EQU X44S.24S X64S.44S
## [15] X90S.64S
## <0 rows> (or 0-length row.names)
temp_data<-na.omit(temp_data)
temp_data<-temp_data[,-(5:15),drop=FALSE]
temp_recent <- temp_data %>%
filter(Year > 1949)
dim(temp_recent)
## [1] 67 4
names(temp_recent)
## [1] "Year" "Glob" "NHem" "SHem"
head(temp_recent)
## Year Glob NHem SHem
## 1 1950 -0.18 -0.18 -0.20
## 2 1951 -0.06 0.04 -0.17
## 3 1952 0.01 0.05 -0.03
## 4 1953 0.07 0.23 -0.08
## 5 1954 -0.14 -0.04 -0.25
## 6 1955 -0.14 -0.11 -0.20
summary(temp_recent)
## Year Glob NHem SHem
## Min. :1950 Min. :-0.2000 Min. :-0.2600 Min. :-0.2500
## 1st Qu.:1966 1st Qu.: 0.0200 1st Qu.: 0.0250 1st Qu.:-0.0250
## Median :1983 Median : 0.1800 Median : 0.1700 Median : 0.2600
## Mean :1983 Mean : 0.2536 Mean : 0.2901 Mean : 0.2161
## 3rd Qu.:2000 3rd Qu.: 0.4950 3rd Qu.: 0.6150 3rd Qu.: 0.4150
## Max. :2016 Max. : 0.9900 Max. : 1.2800 Max. : 0.7000
# plot of Global, Northern Hemisphere, Southern Hemisphere deviation from corresponding mean by year
ggplot(temp_recent,aes(x=Year))+
geom_line(aes(y=Glob, color="Global"))+
geom_line(aes(y=NHem, color="Northern Hemisphere"))+
geom_line(aes(y=SHem, color="Southern Hemisphere"))+
<<<<<<< HEAD
ylab("Temperature Anomolies")+
geom_smooth(method='lm', aes(Year,Glob), colour="red", size=0.5, se=FALSE)+
geom_smooth(method='lm', aes(Year,NHem),colour="green", size=0.5, se=FALSE)+
geom_smooth(method='lm', aes(Year,SHem), colour="blue", size=0.5, se=FALSE)
CO2 <- read_csv("~/Desktop/Bayes Capstone/CO2.csv")
## Parsed with column specification:
## cols(
## year = col_integer(),
## month = col_integer(),
## decimal = col_double(),
## average = col_double(),
## trend = col_double()
## )
# Alter CO2 to remove unnecessary variable trend
CO2 <- subset(CO2, select = -trend)
# Create subset of CO2 data from year 2000
# to show only fluxuations in CO2 levels throughout one year
CO2_2000 <- subset(CO2, year == 2000)
head(CO2_2000)
## # A tibble: 6 x 4
## year month decimal average
## <int> <int> <dbl> <dbl>
## 1 2000 1 2000.042 369.21
## 2 2000 2 2000.125 369.46
## 3 2000 3 2000.208 369.78
## 4 2000 4 2000.292 370.18
## 5 2000 5 2000.375 370.08
## 6 2000 6 2000.458 369.17
# Plot this example
plot(CO2_2000$average ~ CO2_2000$month)
# Create time series to show variation over months and years
Average <- ts(CO2$average, frequency = 12, start = 1980)
# Plot and add axis labels
v1 <- c(1980,1985,1990,1995,2000,2005,2010,2015)
v2 <- c(1980,1985,1990,1995,2000,2005,2010,2015)
plot(Average)
axis(side = 1,
at = v1,
labels = v2,
tck=-.05)
# Create colored time series
CO2$Date <- as.yearmon(paste(CO2$year, CO2$month), "%Y %m")
# Plot this time series
ggplot(CO2, aes(Date, Average)) + geom_line( aes(colour = year))
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
# Add trend line
ggplot(CO2, aes(Date, Average)) + geom_line( aes(colour = year)) + geom_smooth()
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
## `geom_smooth()` using method = 'loess'
# Show increasing CO2 over the years by month
ggplot(CO2, aes(x = month, y = average, color = year)) + geom_point()
#ggplot(hurricanes_categories, aes(x = YEAR, y = TOTAL_HURRICANES, color = CAT, fill = CAT)) +
#geom_bar(stat = "identity") +
#geom_abline(data=temp_data, aes(x=Year, y=Glob))
# Show increasing CO2 over the years by month
library(ggplot2)
library(lubridate)
CO2 <- read_csv("~/Desktop/Bayes Capstone/CO2.csv")
## Parsed with column specification:
## cols(
## year = col_integer(),
## month = col_integer(),
## decimal = col_double(),
## average = col_double(),
## trend = col_double()
## )
ggplot(CO2, aes(x = month, y = average, color = year)) + geom_point()
# Alter CO2 to remove unnecessary variable trend
CO2 <- subset(CO2, select = -trend)
# Create subset of CO2 data from year 2000
# to show only fluxuations in CO2 levels throughout one year
CO2_2000 <- subset(CO2, year == 2000)
head(CO2_2000)
## # A tibble: 6 x 4
## year month decimal average
## <int> <int> <dbl> <dbl>
## 1 2000 1 2000.042 369.21
## 2 2000 2 2000.125 369.46
## 3 2000 3 2000.208 369.78
## 4 2000 4 2000.292 370.18
## 5 2000 5 2000.375 370.08
## 6 2000 6 2000.458 369.17
# Plot this example
plot(CO2_2000$average ~ CO2_2000$month)
# Create time series to show variation over months and years
Average <- ts(CO2$average, frequency = 12, start = 1980)
# Plot and add axis labels
v1 <- c(1980,1985,1990,1995,2000,2005,2010,2015)
v2 <- c(1980,1985,1990,1995,2000,2005,2010,2015)
plot(Average)
axis(side = 1,
at = v1,
labels = v2,
tck=-.05)
# Create colored time series
CO2$Date <- as.yearmon(paste(CO2$year, CO2$month), "%Y %m")
ggplot(CO2, aes(Date, Average)) + geom_line(aes(colour = year))
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
# Show increasing CO2 over the years by month
ggplot(CO2, aes(x = month, y = average, color = year)) + geom_point()
NAO_index1 <- read_csv("NAO_index_monthly.csv",
col_types = cols(Apr = col_number(),
Aug = col_number(), Dec = col_number(),
Feb = col_number(), Jan = col_number(),
Jul = col_number(), Jun = col_number(),
Mar = col_number(), May = col_number(),
Nov = col_number(), Oct = col_number(),
Sep = col_number()))
## Warning: Missing column names filled in: 'X1' [1]
NAO_index2 <- NAO_index1[, -c(14)]
NAO_index3 <- NAO_index2 %>%
rename(year = X1) %>%
mutate(Jul = replace(Jul, Jul < -50| Jul > 50, "")) %>%
mutate(Aug = replace(Aug, Aug < -50| Aug > 50, "")) %>%
mutate(Sep = replace(Sep, Sep < -50| Sep > 50, "")) %>%
mutate(Oct = replace(Oct, Oct < -50| Oct > 50, "")) %>%
mutate(Nov = replace(Nov, Nov < -50| Nov > 50, "")) %>%
mutate(Dec = replace(Dec, Dec < -50| Dec > 50, ""))
NAO_index <- NAO_index3[-nrow(NAO_index3),]
dim(NAO_index)
## [1] 153 13
names(NAO_index)
## [1] "year" "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep"
## [11] "Oct" "Nov" "Dec"
head(NAO_index)
## # A tibble: 6 x 13
## year Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 1865 -0.6 -1.2 0.2 -0.2 -0.4 0.0 0.5 1.5 1.8 -2 -0.9
## 2 1866 0.5 0.8 -0.6 -2.3 -2.0 0.9 -0.5 -0.2 2.4 -0.3 -0.5
## 3 1867 -3.5 1.1 -4.3 1.8 -4.2 0.1 -2 1.9 1.4 2.2 -3.5
## 4 1868 0.7 3.0 3.6 1.7 2.3 3.1 0.4 1.5 -2.8 3.5 -1.8
## 5 1869 1.0 2.5 0.2 -0.2 -2.7 -1.9 -0.3 -1 -0.4 -1.4 1.4
## 6 1870 0.4 -2.8 -2.0 1.1 1.5 0.3 0.5 -3.6 0.2 0.9 -2.1
## # ... with 1 more variables: Dec <chr>
summary(NAO_index)
## year Jan Feb Mar
## Length:153 Min. :-5.9000 Min. :-4.9000 Min. :-4.3000
## Class :character 1st Qu.:-0.9000 1st Qu.:-1.0000 1st Qu.:-1.2000
## Mode :character Median : 0.5000 Median : 0.5000 Median : 0.3000
## Mean : 0.1052 Mean : 0.1641 Mean : 0.1719
## 3rd Qu.: 1.4000 3rd Qu.: 1.4000 3rd Qu.: 1.4000
## Max. : 3.5000 Max. : 3.7000 Max. : 4.4000
## Apr May Jun Jul
## Min. :-3.30000 Min. :-4.20000 Min. :-4.000 Length:153
## 1st Qu.:-1.30000 1st Qu.:-1.20000 1st Qu.:-1.400 Class :character
## Median :-0.10000 Median :-0.20000 Median :-0.100 Mode :character
## Mean : 0.01307 Mean :-0.02353 Mean :-0.132
## 3rd Qu.: 1.20000 3rd Qu.: 1.20000 3rd Qu.: 1.000
## Max. : 4.10000 Max. : 5.20000 Max. : 3.500
## Aug Sep Oct
## Length:153 Length:153 Length:153
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## Nov Dec
## Length:153 Length:153
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# remove 2017 (data for year is incomplete)
NAO_index <- head(NAO_index, -1)
# reshape table
NAO_index <- melt(NAO_index, id=c("year"))
colnames(NAO_index)[2] <- "month"
colnames(NAO_index)[3] <- "index"
NAO_index$year <- as.factor(NAO_index$year)
NAO_index$index <- as.numeric(NAO_index$index)
# group by month
NAO_avgMonth <- aggregate(NAO_index[, 3], list(NAO_index$month), mean)
# plot
ggplot(data = NAO_avgMonth, aes(x = Group.1, y = x)) + geom_point(color = "blue") + xlab("month") + ylab("NAO index, averaged across all years")